Things to add to report

  • Hypothesis that there should be 3 groups - light-med, heavy, and synthetic
In [163]:
import pandas as pd
import numpy as np
from IPython.display import display,  Image
from datetime import datetime
import requests
import os
import joblib

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering
from sklearn.metrics.cluster import contingency_matrix
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score, 
    adjusted_rand_score,
    normalized_mutual_info_score,
    fowlkes_mallows_score,
    silhouette_samples,
    # classification_report,
    confusion_matrix
)
from scipy.optimize import linear_sum_assignment

import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from IPython.display import Image

pd.set_option('future.no_silent_downcasting', True)
alt.data_transformers.disable_max_rows()
Out[163]:
DataTransformerRegistry.enable('default')

Data Source¶

Crude Dict¶

In [164]:
crude_oils_dict = {
    'MSW Feeder':{'min_date':'1999-11-12','grades':{'Federated':'FD','Light Smiley':'MSY','Peace':'MPR','Pembina':'P','Secure Sask Light':'MSE',}},
    'Light Sweet':{'min_date':'1999-11-25','grades':{'Mixed Sweet Blend':'MSW','Rainbow':'RA',}},
    'Medium Sour':{'min_date':'1900-01-00','grades':{'Medium Gibson Sour':'MGS','Midale':'MSM','Peace Pipe Sour':'SPR',}},
    'Pooled Crudes - ex Superior':{'min_date':'1900-01-00','grades':{'Conventional Heavy':'CHV','Hardisty Synthetic Crude':'HSC','Light Sour Blend':'LSB','Medium Sour Blend':'MSB','Mixed Sweet Blend':'Superior','Premium Conventional Heavy':'PCH','Premium Synthetic':'PSY','Synbit Blend':'SYB','Synthetic Sweet Blend':'SYN',}},
    'Light Sour':{'min_date':'1900-01-00','grades':{'BC Light':'BCL','Boundary Lake':'BDY','Koch Alberta':'CAL','Moose Jaw Tops':'MJT','Pembina Light Sour':'PLS',}},
    'Sweet Synthetic':{'min_date':'1900-01-00','grades':{'CNRL Light Sweet Synthetic':'CNS','Husky Synthetic Blend':'HSB','Long Lake Light Synthetic':'PSC','Premium Albian Synthetic':'PAS','Shell Synthetic Light':'SSX','Suncor Synthetic A':'OSA','Syncrude Sweet Premium':'SSP',}},
    'Heavy Sour - Conventional':{'min_date':'1900-01-00','grades':{'Bow River North':'BRN','Bow River South':'BRS','Clearwater Heavy':'CWH','Fosterton':'F','Lloyd Blend':'LLB','Lloyd Kerrobert':'LLK','Seal Heavy':'SH','Smiley-Coleville':'SC','Wabasca Heavy':'WH','Western Canadian Blend':'WCB',}},
    'Heavy Sour - Unconventional':{'min_date':'1900-01-00','grades':{'Access Western Blend':'AWB','Borealis Heavy Blend':'BHB','Canadian Natural High TAN':'CNX','Christina Dilbit Blend':'CDB','Cold Lake':'CL','Fort Hills Dilbit':'FRB','Kearl Lake':'KDB','Leismer Corner Blend':'LCB','Long Lake Heavy Dilbit':'PDH','Surmont Heavy Dilbit':'SHD','Western Canada Dilbit':'WDB','Western Canadian Select':'WCS',}},
    'Heavy Sour - Synbit':{'min_date':'1900-01-00','grades':{'Long Lake Heavy':'PSH','PetroChina Blend':'PXB','Statoil Cheecham Synbit':'SCS','Surmont Heavy Blend':'SHB','Surmont Mix A':'SMA',}},
    'Heavy Sour - Partially Upgraded':{'min_date':'1900-01-00','grades':{'Albian Heavy Synthetic':'AHS',}},
    'Heavy Low Resid':{'min_date':'1900-01-00','grades':{'Suncor Synthetic H':'OSH',}},
}

Data Import Function¶

In [165]:
# Function to create a dataframe from historical crude oil properties by crude_grade
def get_data(crude_grade, crude_oils_dict):
    # Create URL parts
    start_date = crude_oils_dict[crude_grade]['min_date'] # Start date is the min_date defined in the crude_oils_dict for each crude_grade
    end_date = datetime.now().date().strftime('%Y-%m-%d') # End date is the current date
    base_url = 'https://www.crudemonitor.ca/api/1.1/json.php?'
    crude_properties_url = '&crudeProperties%5B0%5D=crudes-BA&crudeProperties%5B1%5D=crudes-LE&crudeProperties%5B2%5D=crudes-HTSD'
    crudes_url = ''.join([f'&crudes%5B{i}%5D={x.replace(' ','+')}' for i, x in enumerate([k for k in crude_oils_dict[crude_grade]['grades'].keys()])])
    date_url = f'&date%5Bstart%5D={start_date}&date%5Bend%5D={end_date}'

    # Merge url parts into full url
    url = base_url + crudes_url + crude_properties_url + date_url

    # Request data from API
    response = requests.get(url)
    response.raise_for_status()

    # Create dataframe
    response_df = pd.DataFrame.from_dict(response.json())

    # Replace values
    response_df = response_df.replace({'': np.nan, 'ND': np.nan}).infer_objects(copy=False)

    # Correct Location names
    response_df['Location'] = response_df['Location'].str.replace(r'(?i)milk.+river', 'Milk River', regex=True)
    
    # Change numeric columns datatypes
    num_cols = response_df.columns[4:]
    response_df[num_cols] = response_df[num_cols].apply(pd.to_numeric, errors='coerce')

    # Downcast numeric columns where possible
    for col in response_df.select_dtypes(include=['float64']).columns:
        response_df[col] = pd.to_numeric(response_df[col], downcast='float') 
    for col in response_df.select_dtypes(include=['int64']).columns:
        response_df[col] = pd.to_numeric(response_df[col], downcast='integer')

    # Convert Sample Date (yyyy-mm-dd) column to datetime
    response_df['Sample Date (yyyy-mm-dd)'] = pd.to_datetime(response_df['Sample Date (yyyy-mm-dd)'])

    # Add target labels to df
    response_df = response_df.assign(target_label=crude_grade)

    # Fix Location
    response_df.loc[response_df['Crude']=='Pembina Light Sour','Location'] = 'Edmonton'

    return response_df

# Load df from file
if os.path.exists('df.csv'):
    df = joblib.load('df.csv')
else:
    df = pd.concat(
        [get_data(crude_grade, crude_oils_dict) for crude_grade in crude_oils_dict.keys()],
        ignore_index=True,
        copy=False
    )
    joblib.dump(df, 'df.csv')

# Display the dataset
df
Out[165]:
Crude Batch Sample Date (yyyy-mm-dd) Location Density (kg/m³) [ASTM D5002] Gravity (°API) [ASTM D5002] Sulphur (wt%) [ASTM D4294] Micro Carbon Residue (wt%) [ASTM D4530] Sediment (ppmw) [ASTM D4807] Total Acid Number (mgKOH/g) [ASTM D664] ... 91 Mass% Recovered (°C) [ASTM D7169] 92 Mass% Recovered (°C) [ASTM D7169] 93 Mass% Recovered (°C) [ASTM D7169] 94 Mass% Recovered (°C) [ASTM D7169] 95 Mass% Recovered (°C) [ASTM D7169] 96 Mass% Recovered (°C) [ASTM D7169] 97 Mass% Recovered (°C) [ASTM D7169] 98 Mass% Recovered (°C) [ASTM D7169] 99 Mass% Recovered (°C) [ASTM D7169] target_label
0 Federated FD-263 2001-11-21 Edmonton 825.400024 39.799999 0.36 1.30 NaN NaN ... 605.099976 629.900024 670.900024 NaN NaN NaN NaN NaN NaN MSW Feeder
1 Federated FD-227 2001-12-11 Edmonton 826.599976 39.500000 0.34 1.10 NaN NaN ... 551.200012 564.799988 577.700012 590.500000 606.400024 630.000000 684.000000 NaN NaN MSW Feeder
2 Federated FD-302 2002-01-22 Edmonton 828.900024 39.099998 0.36 1.40 NaN NaN ... 563.000000 573.299988 584.799988 597.000000 610.099976 625.900024 645.200012 670.200012 710.099976 MSW Feeder
3 Federated Jul 99 - Nov 99 1999-11-12 Enbridge 825.700012 39.700001 0.31 1.51 NaN NaN ... 537.000000 552.000000 550.000000 563.000000 579.000000 599.000000 626.000000 643.000000 650.000000 MSW Feeder
4 Federated Nov 99 - Jan 00 2000-01-16 Enbridge 827.299988 39.299999 0.34 1.49 NaN NaN ... 591.000000 600.000000 562.000000 579.000000 565.000000 579.000000 595.000000 615.000000 639.000000 MSW Feeder
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8556 Suncor Synthetic H OSH-1041 2025-05-02 Hardisty 938.099976 19.200001 3.24 0.70 NaN 3.51 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN Heavy Low Resid
8557 Suncor Synthetic H OSH-1098 2025-06-04 Hardisty 938.500000 19.100000 3.20 0.50 20.0 3.68 ... 493.500000 498.500000 504.100006 510.899994 519.400024 533.200012 552.599976 581.299988 624.900024 Heavy Low Resid
8558 Suncor Synthetic H OSH-1086 2025-07-05 Hardisty 937.700012 19.299999 3.27 1.30 NaN 3.63 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN Heavy Low Resid
8559 Suncor Synthetic H OSH-1560 2025-08-01 Hardisty 937.400024 19.299999 3.18 0.70 NaN 3.40 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN Heavy Low Resid
8560 Suncor Synthetic H OSH-1012 2025-09-02 Hardisty 938.200012 19.200001 3.24 1.00 30.0 3.41 ... 496.299988 501.799988 508.399994 516.700012 528.500000 545.599976 572.000000 610.599976 661.000000 Heavy Low Resid

8561 rows × 133 columns

Data Summary¶

Dataframe Shape & Datatypes¶

In [166]:
df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8561 entries, 0 to 8560
Data columns (total 133 columns):
 #    Column                                   Dtype         
---   ------                                   -----         
 0    Crude                                    object        
 1    Batch                                    object        
 2    Sample Date (yyyy-mm-dd)                 datetime64[ns]
 3    Location                                 object        
 4    Density (kg/m³) [ASTM D5002]             float32       
 5    Gravity (°API) [ASTM D5002]              float32       
 6    Sulphur (wt%) [ASTM D4294]               float32       
 7    Micro Carbon Residue (wt%) [ASTM D4530]  float32       
 8    Sediment (ppmw) [ASTM D4807]             float32       
 9    Total Acid Number (mgKOH/g) [ASTM D664]  float32       
 10   Salt (ptb) [ASTM D3230]                  float32       
 11   Nickel (mg/kg) [ASTM D5708A]             float32       
 12   Vanadium (mg/kg) [ASTM D5708A]           float32       
 13   C1 Methane (vol%) [GC/FID]               float32       
 14   C2 Ethane (vol%) [GC/FID]                float32       
 15   C3 Propane (vol%) [GC/FID]               float32       
 16   iC4 iso-Butane (vol%) [GC/FID]           float32       
 17   nC4 n-Butane (vol%) [GC/FID]             float32       
 18   iC5 iso-Pentane (vol%) [GC/FID]          float32       
 19   nC5 n-Pentane (vol%) [GC/FID]            float32       
 20   C6 Hexanes (vol%) [GC/FID]               float32       
 21   C7 Heptanes (vol%) [GC/FID]              float32       
 22   C8 Octanes (vol%) [GC/FID]               float32       
 23   C9 Nonanes (vol%) [GC/FID]               float32       
 24   C10 Decanes (vol%) [GC/FID]              float32       
 25   Nitrogen - C4- (vol%) [GPA 2177M]        float32       
 26   Carbon Dioxide - C4- (vol%) [GPA 2177M]  float32       
 27   C1 Methane - C4- (vol%) [GPA 2177M]      float32       
 28   C2 Ethane - C4- (vol%) [GPA 2177M]       float32       
 29   C3 Propane - C4- (vol%) [GPA 2177M]      float32       
 30   iC4 iso-Butane - C4- (vol%) [GPA 2177M]  float32       
 31   nC4 n-Butane - C4- (vol%) [GPA 2177M]    float32       
 32   IBP (°C) [ASTM D7169]                    float32       
 33   1 Mass% Recovered (°C) [ASTM D7169]      float32       
 34   2 Mass% Recovered (°C) [ASTM D7169]      float32       
 35   3 Mass% Recovered (°C) [ASTM D7169]      float32       
 36   4 Mass% Recovered (°C) [ASTM D7169]      float32       
 37   5 Mass% Recovered (°C) [ASTM D7169]      float32       
 38   6 Mass% Recovered (°C) [ASTM D7169]      float32       
 39   7 Mass% Recovered (°C) [ASTM D7169]      float32       
 40   8 Mass% Recovered (°C) [ASTM D7169]      float32       
 41   9 Mass% Recovered (°C) [ASTM D7169]      float32       
 42   10 Mass% Recovered (°C) [ASTM D7169]     float32       
 43   11 Mass% Recovered (°C) [ASTM D7169]     float32       
 44   12 Mass% Recovered (°C) [ASTM D7169]     float32       
 45   13 Mass% Recovered (°C) [ASTM D7169]     float32       
 46   14 Mass% Recovered (°C) [ASTM D7169]     float32       
 47   15 Mass% Recovered (°C) [ASTM D7169]     float32       
 48   16 Mass% Recovered (°C) [ASTM D7169]     float32       
 49   17 Mass% Recovered (°C) [ASTM D7169]     float32       
 50   18 Mass% Recovered (°C) [ASTM D7169]     float32       
 51   19 Mass% Recovered (°C) [ASTM D7169]     float32       
 52   20 Mass% Recovered (°C) [ASTM D7169]     float32       
 53   21 Mass% Recovered (°C) [ASTM D7169]     float32       
 54   22 Mass% Recovered (°C) [ASTM D7169]     float32       
 55   23 Mass% Recovered (°C) [ASTM D7169]     float32       
 56   24 Mass% Recovered (°C) [ASTM D7169]     float32       
 57   25 Mass% Recovered (°C) [ASTM D7169]     float32       
 58   26 Mass% Recovered (°C) [ASTM D7169]     float32       
 59   27 Mass% Recovered (°C) [ASTM D7169]     float32       
 60   28 Mass% Recovered (°C) [ASTM D7169]     float32       
 61   29 Mass% Recovered (°C) [ASTM D7169]     float32       
 62   30 Mass% Recovered (°C) [ASTM D7169]     float32       
 63   31 Mass% Recovered (°C) [ASTM D7169]     float32       
 64   32 Mass% Recovered (°C) [ASTM D7169]     float32       
 65   33 Mass% Recovered (°C) [ASTM D7169]     float32       
 66   34 Mass% Recovered (°C) [ASTM D7169]     float32       
 67   35 Mass% Recovered (°C) [ASTM D7169]     float32       
 68   36 Mass% Recovered (°C) [ASTM D7169]     float32       
 69   37 Mass% Recovered (°C) [ASTM D7169]     float32       
 70   38 Mass% Recovered (°C) [ASTM D7169]     float32       
 71   39 Mass% Recovered (°C) [ASTM D7169]     float32       
 72   40 Mass% Recovered (°C) [ASTM D7169]     float32       
 73   41 Mass% Recovered (°C) [ASTM D7169]     float32       
 74   42 Mass% Recovered (°C) [ASTM D7169]     float32       
 75   43 Mass% Recovered (°C) [ASTM D7169]     float32       
 76   44 Mass% Recovered (°C) [ASTM D7169]     float32       
 77   45 Mass% Recovered (°C) [ASTM D7169]     float32       
 78   46 Mass% Recovered (°C) [ASTM D7169]     float32       
 79   47 Mass% Recovered (°C) [ASTM D7169]     float32       
 80   48 Mass% Recovered (°C) [ASTM D7169]     float32       
 81   49 Mass% Recovered (°C) [ASTM D7169]     float32       
 82   50 Mass% Recovered (°C) [ASTM D7169]     float32       
 83   51 Mass% Recovered (°C) [ASTM D7169]     float32       
 84   52 Mass% Recovered (°C) [ASTM D7169]     float32       
 85   53 Mass% Recovered (°C) [ASTM D7169]     float32       
 86   54 Mass% Recovered (°C) [ASTM D7169]     float32       
 87   55 Mass% Recovered (°C) [ASTM D7169]     float32       
 88   56 Mass% Recovered (°C) [ASTM D7169]     float32       
 89   57 Mass% Recovered (°C) [ASTM D7169]     float32       
 90   58 Mass% Recovered (°C) [ASTM D7169]     float32       
 91   59 Mass% Recovered (°C) [ASTM D7169]     float32       
 92   60 Mass% Recovered (°C) [ASTM D7169]     float32       
 93   61 Mass% Recovered (°C) [ASTM D7169]     float32       
 94   62 Mass% Recovered (°C) [ASTM D7169]     float32       
 95   63 Mass% Recovered (°C) [ASTM D7169]     float32       
 96   64 Mass% Recovered (°C) [ASTM D7169]     float32       
 97   65 Mass% Recovered (°C) [ASTM D7169]     float32       
 98   66 Mass% Recovered (°C) [ASTM D7169]     float32       
 99   67 Mass% Recovered (°C) [ASTM D7169]     float32       
 100  68 Mass% Recovered (°C) [ASTM D7169]     float32       
 101  69 Mass% Recovered (°C) [ASTM D7169]     float32       
 102  70 Mass% Recovered (°C) [ASTM D7169]     float32       
 103  71 Mass% Recovered (°C) [ASTM D7169]     float32       
 104  72 Mass% Recovered (°C) [ASTM D7169]     float32       
 105  73 Mass% Recovered (°C) [ASTM D7169]     float32       
 106  74 Mass% Recovered (°C) [ASTM D7169]     float32       
 107  75 Mass% Recovered (°C) [ASTM D7169]     float32       
 108  76 Mass% Recovered (°C) [ASTM D7169]     float32       
 109  77 Mass% Recovered (°C) [ASTM D7169]     float32       
 110  78 Mass% Recovered (°C) [ASTM D7169]     float32       
 111  79 Mass% Recovered (°C) [ASTM D7169]     float32       
 112  80 Mass% Recovered (°C) [ASTM D7169]     float32       
 113  81 Mass% Recovered (°C) [ASTM D7169]     float32       
 114  82 Mass% Recovered (°C) [ASTM D7169]     float32       
 115  83 Mass% Recovered (°C) [ASTM D7169]     float32       
 116  84 Mass% Recovered (°C) [ASTM D7169]     float32       
 117  85 Mass% Recovered (°C) [ASTM D7169]     float32       
 118  86 Mass% Recovered (°C) [ASTM D7169]     float32       
 119  87 Mass% Recovered (°C) [ASTM D7169]     float32       
 120  88 Mass% Recovered (°C) [ASTM D7169]     float32       
 121  89 Mass% Recovered (°C) [ASTM D7169]     float32       
 122  90 Mass% Recovered (°C) [ASTM D7169]     float32       
 123  91 Mass% Recovered (°C) [ASTM D7169]     float32       
 124  92 Mass% Recovered (°C) [ASTM D7169]     float32       
 125  93 Mass% Recovered (°C) [ASTM D7169]     float32       
 126  94 Mass% Recovered (°C) [ASTM D7169]     float32       
 127  95 Mass% Recovered (°C) [ASTM D7169]     float32       
 128  96 Mass% Recovered (°C) [ASTM D7169]     float32       
 129  97 Mass% Recovered (°C) [ASTM D7169]     float32       
 130  98 Mass% Recovered (°C) [ASTM D7169]     float32       
 131  99 Mass% Recovered (°C) [ASTM D7169]     float32       
 132  target_label                             object        
dtypes: datetime64[ns](1), float32(128), object(4)
memory usage: 4.5+ MB

Feature Descriptions¶

Feature Description
Crude Name of the crude oil grade.
Batch Shipment or lot identifier for the sample.
Sample Date (yyyy-mm-dd) Date the crude sample was collected.
Location Site or facility where the sample was taken.
Density (kg/m³) [ASTM D5002] Mass per unit volume; indicates heaviness of crude.
Gravity (°API) [ASTM D5002] Measure of crude lightness; inverse of density.
Sulphur (wt%) [ASTM D4294] Sulphur content; affects refining and emissions.
Micro Carbon Residue (wt%) [ASTM D4530] Carbon left after pyrolysis; indicator of coke-forming tendency.
Sediment (ppmw) [ASTM D4807] Solid impurities or particles in crude.
Total Acid Number (mgKOH/g) [ASTM D664] Measure of acidity; indicates corrosive potential.
Salt (ptb) [ASTM D3230] Salt concentration; impacts corrosion and desalting needs.
Nickel (mg/kg) [ASTM D5708A] Metal contaminant affecting catalyst life.
Vanadium (mg/kg) [ASTM D5708A] Metal impurity impacting refining catalysts.
C1–C10 Components (vol%) [GC/FID] Light hydrocarbon composition (methane to decane) by gas chromatography; indicates volatility and gas content.
iC4–nC5 (vol%) [GC/FID] Branched and normal butane/pentane fractions; measure of light-end structure.
C6–C10 (vol%) [GC/FID] Heavier paraffins; affect vapor pressure and yield profiles.
Nitrogen – C4- (vol%) [GPA 2177M] Nitrogen content in light gas fraction.
Carbon Dioxide – C4- (vol%) [GPA 2177M] CO₂ concentration in gas fraction.
C1–nC4 – C4- (vol%) [GPA 2177M] Light hydrocarbon breakdown (methane through butane) in gas stream.
IBP (°C) [ASTM D7169] Initial boiling point; start of vaporization during distillation.
1–99 Mass% Recovered (°C) [ASTM D7169] Distillation temperatures where 1–99% of sample mass is vaporized; describes boiling range and fractionation behavior.
target_label CrudeMonitor crude group name

EDA¶

Column Groups¶

In [167]:
# Categorical columns
categorical_cols = ['Crude', 'Batch', 'Sample Date (yyyy-mm-dd)', 'Location', 'target_label']

# Columns that describe the quality of the crude oil
crude_quality_cols = df.loc[:,'Density (kg/m³) [ASTM D5002]':'IBP (°C) [ASTM D7169]'].columns.to_list()

# Columns that describe distillation temperatures for the crude oil
dist_temp_cols = df.loc[:,'1 Mass% Recovered (°C) [ASTM D7169]':'99 Mass% Recovered (°C) [ASTM D7169]'].columns.to_list()

Categorical Frequency Charts¶

In [168]:
alt.Chart(df[categorical_cols], title=alt.Title('Frequency of Crude Grade Categories', fontSize=15, anchor='start')).mark_bar().encode(
    alt.Y('target_label:N').sort('-x').title(None),
    alt.X('count():Q').title(None)
).properties(height=300, width=300)
Out[168]:
In [169]:
alt.Chart(df[categorical_cols], title=alt.Title('Frequency of Location Categories', fontSize=15, anchor='start')).mark_bar().encode(
    alt.Y('Location:N').sort('-x').title(None),
    alt.X('count():Q').title(None)
).properties(height=300, width=400)
Out[169]:

Missing Values¶

In [170]:
df_na = pd.DataFrame({'Values Missing':df.isna().sum()}).reset_index()

alt.Chart(df_na, title=alt.Title('Counts of Missing Values', fontSize=15, anchor='start')).mark_bar().encode(
    alt.Y('Values Missing:N').title('Binned Count of Missing Values').bin(True),
    alt.X('count()').title('Count of Columns'),
).properties(height=300, width=400)
Out[170]:
In [171]:
# Create heatmap to visualize where data is missing in the dataset
fig, ax = plt.subplots(1,1, figsize=(30,30), sharey=True) 
# plt.suptitle('Rows with Missing Values', y=0.9, fontsize = 25)
sns.heatmap(df.T.isna(), cmap=sns.color_palette('blend:#ffffff,black'), cbar=False, ax=ax)

plt.show()
No description has been provided for this image

Numeric Histograms¶

In [185]:
# Function to plot a set of histograms for a subset of columns
def plot_column_histograms(input_df, subset, chart_title, col_count, row_sample_size):
    # Melt the numeric columns into one column
    df_melt = input_df[subset + ['target_label']].melt(id_vars='target_label', var_name='feature', value_name='value').sample(n=row_sample_size)

    # Create a base Altair histogram chart
    chart = alt.Chart(df_melt).mark_bar(opacity=0.6, binSpacing=0).encode(
        x = alt.X('value:Q').axis(title='').bin(maxbins=50),
        y = alt.Y('count():Q').axis(title='').stack(None),
        color = alt.Color('target_label:N').scale(scheme='viridis').title('Crude Grade').legend(orient='left', direction='vertical')
    ).properties(
        width=750 / col_count,
        height=150
    )

    # Display a histogram for each numeric_columns
    chart = alt.ConcatChart(
        title=alt.Title(f'{chart_title}', fontSize=20),
        concat=[chart.transform_filter(alt.datum.feature == value).properties(title=value) for value in subset],
        columns=col_count,
        # spacing=1
    ).configure_title(
        fontSize=10
    ).resolve_axis(
        x='independent',
        y='independent'
    ).resolve_scale(
        x='independent', 
        y='independent'
    )

    return chart

plot_column_histograms(df, crude_quality_cols, 'Crude Quality Columns Data Distributions', 5, 100000)
Out[185]:
In [173]:
plot_column_histograms(df, crude_quality_cols, 'Crude Quality Columns Data Distributions', 5, 100000)
Out[173]:
In [186]:
[f'{i} Mass% Recovered (°C) [ASTM D7169]' for i in range(5,len(dist_temp_cols), 5)]

plot_column_histograms(df, [f'{i} Mass% Recovered (°C) [ASTM D7169]' for i in range(5,len(dist_temp_cols), 5)], 'Crude Quality Columns Data Distributions', 5, 100000)
Out[186]:
In [176]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[crude_quality_cols + dist_temp_cols].drop('C1 Methane (vol%) [GC/FID]', axis=1))
df_scaled = pd.DataFrame(df_scaled, columns=scaler.get_feature_names_out())
df_scaled
Out[176]:
Density (kg/m³) [ASTM D5002] Gravity (°API) [ASTM D5002] Sulphur (wt%) [ASTM D4294] Micro Carbon Residue (wt%) [ASTM D4530] Sediment (ppmw) [ASTM D4807] Total Acid Number (mgKOH/g) [ASTM D664] Salt (ptb) [ASTM D3230] Nickel (mg/kg) [ASTM D5708A] Vanadium (mg/kg) [ASTM D5708A] C2 Ethane (vol%) [GC/FID] ... 90 Mass% Recovered (°C) [ASTM D7169] 91 Mass% Recovered (°C) [ASTM D7169] 92 Mass% Recovered (°C) [ASTM D7169] 93 Mass% Recovered (°C) [ASTM D7169] 94 Mass% Recovered (°C) [ASTM D7169] 95 Mass% Recovered (°C) [ASTM D7169] 96 Mass% Recovered (°C) [ASTM D7169] 97 Mass% Recovered (°C) [ASTM D7169] 98 Mass% Recovered (°C) [ASTM D7169] 99 Mass% Recovered (°C) [ASTM D7169]
0 -1.429622 1.456850 -1.180993 -1.241242 NaN NaN NaN -1.416204 -1.354156 1.127020 ... 0.249568 0.368546 0.586741 0.978241 NaN NaN NaN NaN NaN NaN
1 -1.402459 1.419774 -1.194246 -1.291202 NaN NaN NaN NaN NaN -0.875153 ... -0.269786 -0.198702 -0.094364 0.008309 0.136085 0.301797 0.600143 1.280640 NaN NaN
2 -1.350394 1.370337 -1.180993 -1.216262 NaN NaN NaN -1.390273 -1.344211 -0.875153 ... -0.128240 -0.074518 -0.005433 0.082198 0.203923 0.340658 0.555596 0.831786 1.233326 1.808874
3 -1.422831 1.444492 -1.214125 -1.188783 NaN NaN NaN -1.385952 -1.339239 3.796584 ... -0.415588 -0.348145 -0.228283 -0.279965 -0.150921 0.014014 0.263324 0.609673 0.877329 0.851515
4 -1.386613 1.395056 -1.194246 -1.193779 NaN NaN NaN -1.360022 -1.322664 1.127020 ... 0.073967 0.220157 0.273914 -0.155081 0.016065 -0.133029 0.046021 0.251053 0.510860 0.676291
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8556 1.121502 -1.089098 0.727410 -1.391122 NaN 3.273269 NaN -1.312483 -1.208300 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8557 1.130557 -1.101457 0.700905 -1.441082 -1.269125 3.511563 -0.740331 -1.351378 -1.243107 NaN ... -0.800847 -0.805943 -0.788023 -0.757646 -0.694668 -0.611967 -0.451602 -0.239448 0.069789 0.451686
8558 1.112448 -1.076739 0.747290 -1.241242 NaN 3.441477 NaN -1.200119 -1.087306 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8559 1.105657 -1.076739 0.687652 -1.391122 NaN 3.119080 NaN -1.351378 -1.238134 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8560 1.123766 -1.089098 0.727410 -1.316182 -1.205620 3.133097 -0.725770 -1.277910 -1.166864 NaN ... -0.777433 -0.776475 -0.753497 -0.712896 -0.634135 -0.516389 -0.316875 -0.015020 0.453272 1.026739

8561 rows × 127 columns

In [177]:
# Correlation Heatmap
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    df[crude_quality_cols + dist_temp_cols].corr(),
    cmap='viridis_r',
    mask=abs(df[crude_quality_cols + dist_temp_cols].corr())<=0.5, # Plot only absolute correlation values that are >=0.5
    square=True,
    vmin=-1, vmax=1,
    xticklabels=True, yticklabels=True,
    cbar_kws={'location':'bottom', 'aspect':60, 'shrink':0.6}
)

plt.xticks(fontsize=5) 
plt.yticks(fontsize=5) 
plt.title('Correlation Heatmap\n(where absolute correlation values are >=0.5)')
plt.show()
No description has been provided for this image

Data Cleaning¶

Add Datetime Columns¶

In [178]:
# Add Months and Years columns
df['Months'] = (df['Sample Date (yyyy-mm-dd)'].dt.to_period('M') - df['Sample Date (yyyy-mm-dd)'].min().to_period('M')).apply(lambda x: x.n)
df['Year'] = df['Sample Date (yyyy-mm-dd)'].dt.year

# Update the crude_quality_cols list to add Months and Year columns
categorical_cols = categorical_cols + ['Months', 'Year']
Out[178]:
['Crude',
 'Batch',
 'Sample Date (yyyy-mm-dd)',
 'Location',
 'target_label',
 'Months',
 'Year']

Drop Cols¶

In [180]:
# Define the set of crude_quality_cols to drop
crude_quality_cols_to_drop = [
    'Sediment (ppmw) [ASTM D4807]',
    'Total Acid Number (mgKOH/g) [ASTM D664]',
    'Salt (ptb) [ASTM D3230]',
    'C1 Methane (vol%) [GC/FID]',
    'Nitrogen - C4- (vol%) [GPA 2177M]',
    'Carbon Dioxide - C4- (vol%) [GPA 2177M]',
    'C1 Methane - C4- (vol%) [GPA 2177M]',
    'C2 Ethane - C4- (vol%) [GPA 2177M]',
    'C3 Propane - C4- (vol%) [GPA 2177M]',
    'iC4 iso-Butane - C4- (vol%) [GPA 2177M]',
    'nC4 n-Butane - C4- (vol%) [GPA 2177M]'
]

# Drop columns from dataset
df = df.drop(crude_quality_cols_to_drop, axis=1)

# Update the crude_quality_cols list to remove the crude_quality_cols_to_drop
crude_quality_cols = [col for col in crude_quality_cols if col not in crude_quality_cols_to_drop]

# Define categorical_cols to drop
categorical_cols_to_drop = [
    'Crude',
    'Batch',
    'Sample Date (yyyy-mm-dd)'
]

# Drop categorical columns
df = df.drop(categorical_cols_to_drop, axis=1)

# Update categorical_cols to remove the dropped columns
categorical_cols = [col for col in categorical_cols if col not in categorical_cols_to_drop]

categorical_cols
Out[180]:
['Location', 'target_label', 'Months', 'Year']

Impute Values¶

In [182]:
# Create a KNNImputer instance which uses the 10 nearest neighbors to the missing value to calculate an imputed mean value
imputer = KNNImputer(n_neighbors=10, weights='distance')
df.loc[:, crude_quality_cols + dist_temp_cols] = imputer.fit_transform(df[crude_quality_cols + dist_temp_cols])
df
Out[182]:
Location Density (kg/m³) [ASTM D5002] Gravity (°API) [ASTM D5002] Sulphur (wt%) [ASTM D4294] Micro Carbon Residue (wt%) [ASTM D4530] Nickel (mg/kg) [ASTM D5708A] Vanadium (mg/kg) [ASTM D5708A] C2 Ethane (vol%) [GC/FID] C3 Propane (vol%) [GC/FID] iC4 iso-Butane (vol%) [GC/FID] ... 93 Mass% Recovered (°C) [ASTM D7169] 94 Mass% Recovered (°C) [ASTM D7169] 95 Mass% Recovered (°C) [ASTM D7169] 96 Mass% Recovered (°C) [ASTM D7169] 97 Mass% Recovered (°C) [ASTM D7169] 98 Mass% Recovered (°C) [ASTM D7169] 99 Mass% Recovered (°C) [ASTM D7169] target_label Months Year
0 Edmonton 825.400024 39.799999 0.36 1.30 1.900000 2.200000 0.050000 0.440000 0.58 ... 670.900024 680.556519 707.183411 695.826843 697.359253 694.271790 695.308594 MSW Feeder 24 2001
1 Edmonton 826.599976 39.500000 0.34 1.10 5.285436 8.897344 0.020000 0.280000 0.25 ... 577.700012 590.500000 606.400024 630.000000 684.000000 697.202332 686.257935 MSW Feeder 25 2001
2 Edmonton 828.900024 39.099998 0.36 1.40 2.500000 2.800000 0.020000 0.330000 0.31 ... 584.799988 597.000000 610.099976 625.900024 645.200012 670.200012 710.099976 MSW Feeder 26 2002
3 Enbridge 825.700012 39.700001 0.31 1.51 2.600000 3.100000 0.090000 0.440000 0.30 ... 550.000000 563.000000 579.000000 599.000000 626.000000 643.000000 650.000000 MSW Feeder 0 1999
4 Enbridge 827.299988 39.299999 0.34 1.49 3.200000 4.100000 0.050000 0.330000 0.26 ... 562.000000 579.000000 565.000000 579.000000 595.000000 615.000000 639.000000 MSW Feeder 2 2000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8556 Hardisty 938.099976 19.200001 3.24 0.70 4.300000 11.000000 0.025617 0.040017 0.08 ... 513.449951 532.250000 568.950012 612.099976 528.099976 564.500000 634.900024 Heavy Low Resid 306 2025
8557 Hardisty 938.500000 19.100000 3.20 0.50 3.400000 8.900000 0.030000 0.042588 0.08 ... 504.100006 510.899994 519.400024 533.200012 552.599976 581.299988 624.900024 Heavy Low Resid 307 2025
8558 Hardisty 937.700012 19.299999 3.27 1.30 6.900000 18.299999 0.026547 0.040270 0.08 ... 537.855408 562.538757 589.320740 603.108887 601.471741 592.708862 647.631348 Heavy Low Resid 308 2025
8559 Hardisty 937.400024 19.299999 3.18 0.70 3.400000 9.200000 0.025407 0.034707 0.07 ... 513.706543 529.208740 541.829956 576.329773 564.684692 597.023132 632.330750 Heavy Low Resid 309 2025
8560 Hardisty 938.200012 19.200001 3.24 1.00 5.100000 13.500000 0.030000 0.043530 0.12 ... 508.399994 516.700012 528.500000 545.599976 572.000000 610.599976 661.000000 Heavy Low Resid 310 2025

8561 rows × 121 columns

Pre-Processing¶

In [183]:
# Define the feature matrix
X = df[['Months', 'Year', 'Location'] + crude_quality_cols + dist_temp_cols]

# Define the target label values
y_true = df['target_label']

# Create a preprocessor
preprocessor = ColumnTransformer(
    [('num1', Pipeline([('scaler', StandardScaler())]), ['Months', 'Year']),
     ('num2', Pipeline([('scaler', StandardScaler())]), crude_quality_cols),
     ('num3', Pipeline([('scaler', StandardScaler())]), dist_temp_cols),
     ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), ['Location'])],
    remainder='drop'
).fit(X)

# Preprocess the feature matrix
X_transform = preprocessor.transform(X)

Data Source

  • Func to pull data by group
  • Replace "" with np.nan
  • Add target label column
  • Func to store df

Data Summary

  • Shape
  • Datatypes
  • Feature/column descriptions

EDA

  • Create column groups
  • Missing values bar chart
  • Categorical cols (incl. target) frequency bar charts
  • Numeric (temp replace non-numeric values and standardize):
    • Histograms (colored by target labels)
    • Correlation heatmap

Data Cleaning

  • Split into X and y
    • y = TargetEncoder target labels
  • Create Pre-processing ColumnTransformer pipeline
    • Custom FunctionTransformers
      • Drop columns that aren't needed or contain large amounts of missing values
      • Add datetime columns (months, year)
    • KNNImputer numeric
    • SimpleImputer categorical
    • StandardScaler numeric
    • OneHotEncoder categorical
  • Apply preprocessing pipeline to dataframe

Modeling

  • K-means
  • Agglomerative
  • GMM

Evaluation

  • Scoring metrics
    • Internal
      • Silhouette scores
      • DBI
      • CHI
    • External
      • ARI
      • NMI
      • FMI
  • Plots
    • Silhouette scores
    • PCA 2D scatter plot
    • t-SNE 2D scatter plot
    • Classification confusion matrices
  • Compare to Supervised Classifiers
    • Multinomial Logistic
    • SVC
    • Gradient Boosting
    • Classification confusion matrices